## DataViz 2.0 Workshop
## Part 2

library(tidyverse)
## ── Attaching packages ─────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggpubr); library(ggrepel)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
##     set_names
## The following object is masked from 'package:tidyr':
##
##     extract
## Data Import
## Read the gene-location table; the first line of the file holds the column
## names. `TRUE` is spelled out because `T` is an ordinary variable that can
## be reassigned, whereas `TRUE` is a reserved word.
gene_loc <- read.table("GSE69360.gene-locations.txt",
                       header = TRUE)

## Plotting
## First look at every gene: End - Start on the x axis, the Length column on
## the y axis, one colour per chromosome.
scatter <- ggplot(data = gene_loc,
                  mapping = aes(x = End - Start, y = Length,
                                colour = Chr, group = Chr)) +
  geom_point()
scatter

### It is hard to visualize the entire data.
### Let's pretend we are only interested in a small set of chromosomes.
### Let's subset the data and add a few variables!

## Chromosomes kept for the rest of the workshop.
target <- c("chrX", "chrY", "chrM", "chr17")

## Keep only the rows on the target chromosomes, then append log10-transformed
## versions of the Length column and of the End - Start difference.
gene_loc2 <- gene_loc %>%
  filter(Chr %in% target) %>%
  mutate(log_length = log10(Length),
         log_EndStart = log10(End - Start))

## The two transformed vectors are also kept as standalone objects.
log_EndStart <- gene_loc2$log_EndStart
log_length <- gene_loc2$log_length
head(gene_loc2)
##            Geneid   Chr  Start    End Strand Length log_length log_EndStart
## 1 ENSG00000273288 chr17   4961   5048      -     88   1.944483     1.939519
## 2 ENSG00000272636 chr17   5810   6168      -   1480   3.170262     2.553883
## 3 ENSG00000273172 chr17  33615  34249      -   1185   3.073718     2.802089
## 4 ENSG00000181031 chr17  62293  63714      -   5953   3.774736     3.152594
## 5 ENSG00000262920 chr17 171183 171422      +    432   2.635484     2.378398
## 6 ENSG00000262061 chr17 180996 183279      +   2284   3.358696     3.358506
## Same scatter plot, now on the subset only... mmm, still not the best.
scatter <- ggplot(gene_loc2) +
  aes(x = End - Start, y = Length, colour = Chr, group = Chr) +
  geom_point()
scatter

## The gray background is annoying: switch to theme_bw() to remove it.
scatter <- ggplot(data = gene_loc2,
                  mapping = aes(x = End - Start, y = Length,
                                colour = Chr, group = Chr)) +
  theme_bw() +
  geom_point()
scatter

### Recap exercise (try it yourself first): the same data with a different
### geometry -- one box plot of Length per chromosome.
box1 <- ggplot(data = gene_loc2,
               mapping = aes(x = Chr, y = Length,
                             colour = Chr, group = Chr)) +
  theme_bw() +
  geom_boxplot()
box1

## Adjust the axes. Points that fall outside these limits are dropped, which
## is what produces the "Removed ... rows" warning when the plot is drawn.
scatter <- ggplot(data = gene_loc2,
                  mapping = aes(x = End - Start, y = Length,
                                colour = Chr, group = Chr)) +
  geom_point() +
  lims(x = c(0, 2500), y = c(0, 10000)) +
  theme_bw()
scatter
## Warning: Removed 392 rows containing missing values (geom_point).

### Where are the green dots? Draw smaller, semi-transparent points so that
### overlapping points are easier to tell apart.
scatter3 <- ggplot(data = gene_loc2,
                   mapping = aes(x = End - Start, y = Length,
                                 colour = Chr, group = Chr)) +
  geom_point(size = 0.5, alpha = 0.7) +
  xlim(0, 2500) +
  ylim(0, 10000) +
  theme_bw()
scatter3
## Warning: Removed 392 rows containing missing values (geom_point).

## Did it change? Compare the two plots side by side
## (panel A: scatter, panel B: scatter3).
ggarrange(scatter, scatter3,
          nrow = 1, ncol = 2,
          labels = c("A", "B"))
## Warning: Removed 392 rows containing missing values (geom_point).

## Warning: Removed 392 rows containing missing values (geom_point).

## Transform the axes... That's better, isn't it?
## `scatter` already carries x and y scales, so the log10 scales added here
## replace them (hence the messages below); theme_minimal() replaces the theme.
trans_scatter <- scatter +
  theme_minimal() +
  scale_x_log10(name = "End-Start") +
  scale_y_log10(name = "Gene length")
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
trans_scatter

## You want to add regression lines. Because colour is mapped to Chr,
## geom_smooth() fits a separate straight line (lm, no confidence band) for
## each chromosome.
scatter1 <- ggplot(data = gene_loc2,
                   mapping = aes(x = log_EndStart, y = log_length,
                                 colour = Chr)) +
  theme_bw() +
  geom_point() +
  geom_smooth(se = FALSE, method = lm)
scatter1

## We can't see the lines clearly. One solution: shrink and fade the points
## so that the fitted lines stand out.
scatter2 <- ggplot(gene_loc2) +
  aes(x = log_EndStart, y = log_length, colour = Chr) +
  theme_bw() +
  geom_point(alpha = 0.2, size = 1) +
  geom_smooth(se = FALSE, method = lm)
scatter2

## Can you put them together in the same figure to compare?
## (panel A: scatter1, panel B: scatter2)
ggarrange(plotlist = list(scatter1, scatter2),
          labels = c("A", "B"),
          nrow = 1, ncol = 2)

## Now, let's add some numerical values to the graph: stat_cor() annotates
## each chromosome with its correlation coefficient (R) and p-value.
scatter <- ggplot(data = gene_loc2,
                  mapping = aes(x = log_EndStart, y = log_length,
                                colour = Chr)) +
  theme_bw() +
  geom_point() +
  geom_smooth(se = FALSE, method = lm) +
  ggpubr::stat_cor()
scatter

## Now, let's add some numerical values to the graph: the equation of each
## fitted regression line.
scatter <- ggplot(gene_loc2) +
  aes(x = log_EndStart, y = log_length, colour = Chr) +
  geom_point() +
  geom_smooth(se = FALSE, method = lm) +
  ggpubr::stat_regline_equation()
scatter

## Your boss wants to see the lines in different plots!
## One facet per chromosome, each with a black regression line, a correlation
## label (placed at y = 4.4) and the regression equation (placed at y = 4.2).
ml_scatter <- ggscatter(
  data = gene_loc2,
  x = "log_EndStart", y = "log_length",
  palette = "jco", color = "Chr",
  add.params = list(color = "black"), add = "reg.line"
) +
  facet_wrap(~ Chr) +
  stat_cor(label.y = 4.4) +
  stat_regline_equation(label.y = 4.2)
ml_scatter

## Labelling a point in a scatter plot: start again from the plain,
## untransformed scatter plot of the subset.
scatter <- ggplot(data = gene_loc2,
                  mapping = aes(x = End - Start, y = Length,
                                colour = Chr, group = Chr)) +
  geom_point()
scatter

## That gene in the corner looks interesting!! What gene is it?
## Print every gene id next to its point. The label is mapped with aes() so
## that it is taken from the same rows the layer actually plots, instead of
## from a separate vector that has to stay aligned with the plot data by hand.
scatter <- ggplot(gene_loc2,
                  aes(x = End - Start, y = Length, group = Chr, color = Chr)) +
  geom_point() +
  geom_text(aes(label = Geneid), size = 2, color = "black")
scatter

## Second example!! Labelling points and adding a confidence interval to a
## regression.
## Per-chromosome summary: mean of Length and number of rows (genes).
a <- summarize(group_by(gene_loc, Chr),
               meanLength = mean(Length),
               numGenes = n())
head(a)
## # A tibble: 6 x 3
##   Chr   meanLength numGenes
##   <fct>      <dbl>    <int>
## 1 chr1       2258.     5363
## 2 chr10      2160.     2260
## 3 chr11      2218.     3208
## 4 chr12      2342.     2818
## 5 chr13      1875.     1217
## 6 chr14      1892.     2244
## One point per chromosome: number of genes against mean gene length.
scatter2 <- ggplot(data = a,
                   mapping = aes(y = meanLength, x = numGenes)) +
  theme_bw() +
  geom_point()
scatter2

## Which chromosome is represented by which point?
## Write the chromosome name on each point. The label is mapped with aes() so
## that it comes from the same rows the layer plots, rather than from a
## separate vector that must be kept aligned with the plot data by hand.
scatter2 <- ggplot(a, aes(x = numGenes, y = meanLength)) +
  geom_point() +
  theme_bw() +
  geom_text(aes(label = Chr), size = 2, color = "black")
scatter2

## geom_text_repel() is a better function for this: red labels, connected to
## their points by blue segments.
scatter2 <- ggplot(data = a,
                   mapping = aes(y = meanLength, x = numGenes)) +
  theme_bw() +
  geom_point() +
  geom_text_repel(mapping = aes(label = Chr),
                  segment.color = "blue", color = "red")
scatter2

## Add a confidence interval: a loess smooth drawn in light blue, with a very
## transparent band (alpha = 0.1) around it.
scatter2 <- ggplot(data = a,
                   mapping = aes(y = meanLength, x = numGenes)) +
  theme_bw() +
  geom_point() +
  geom_text_repel(mapping = aes(label = Chr),
                  segment.color = "blue", color = "red") +
  geom_smooth(alpha = 0.1, color = "lightblue", method = loess)
scatter2